Set working directory and load necessary packages.
Load the data for March. Load each file and combine into one data set. We load and combine each file in H:/Projects/11000/11155/TraffStudy/DataCollection/FreewayData/Detector Data/Volume Data/Day_selection. We only want to load the files that end in “_#.csv“.
[1] "Combined_March.csv" "March_1.csv" "March_10.csv" "March_11.csv"
[5] "March_12.csv" "March_18" "March_2.csv" "March_3.csv"
[9] "March_4.csv" "March_5.csv" "March_6.csv" "March_7.csv"
[13] "March_8.csv" "March_9.csv"
The column names in each file are as follows.
[1] "X" "X.1" "X.2" "X12.15.AM" "X12.30.AM" "X12.45.AM" "X1.AM"
[8] "X1.15.AM" "X1.30.AM" "X1.45.AM" "X2.AM" "X2.15.AM" "X2.30.AM" "X2.45.AM"
[15] "X3.AM" "X3.15.AM" "X3.30.AM" "X3.45.AM" "X4.AM" "X4.15.AM" "X4.30.AM"
[22] "X4.45.AM" "X5.AM" "X5.15.AM" "X5.30.AM" "X5.45.AM" "X6.AM" "X6.15.AM"
[29] "X6.30.AM" "X6.45.AM" "X7.AM" "X7.15.AM" "X7.30.AM" "X7.45.AM" "X8.AM"
[36] "X8.15.AM" "X8.30.AM" "X8.45.AM" "X9.AM" "X9.15.AM" "X9.30.AM" "X9.45.AM"
[43] "X10.AM" "X10.15.AM" "X10.30.AM" "X10.45.AM" "X11.AM" "X11.15.AM" "X11.30.AM"
[50] "X11.45.AM" "Noon" "X12.15.PM" "X12.30.PM" "X12.45.PM" "X1.PM" "X1.15.PM"
[57] "X1.30.PM" "X1.45.PM" "X2.PM" "X2.15.PM" "X2.30.PM" "X2.45.PM" "X3.PM"
[64] "X3.15.PM" "X3.30.PM" "X3.45.PM" "X4.PM" "X4.15.PM" "X4.30.PM" "X4.45.PM"
[71] "X5.PM" "X5.15.PM" "X5.30.PM" "X5.45.PM" "X6.PM" "X6.15.PM" "X6.30.PM"
[78] "X6.45.PM" "X7.PM" "X7.15.PM" "X7.30.PM" "X7.45.PM" "X8.PM" "X8.15.PM"
[85] "X8.30.PM" "X8.45.PM" "X9.PM" "X9.15.PM" "X9.30.PM" "X9.45.PM" "X10.PM"
[92] "X10.15.PM" "X10.30.PM" "X10.45.PM" "X11.PM" "X11.15.PM" "X11.30.PM" "X11.45.PM"
[99] "Midnight" "X.3" "filename"
Let's fix the column headers first. There are no headers for the first 3 columns, and there also appears to be an extra column, “X.3”, at the end of the data frame.
# Summarise the trailing unnamed column to see what it holds.
summary(March_18_dat[["X.3"]])
Mode NA's
logical 39990
The column contains only NAs, so we remove it.
Focus on the first three columns;
'data.frame': 39990 obs. of 3 variables:
$ X : int 1180 1180 1180 1180 1180 1180 1180 1180 1180 1180 ...
$ X.1: chr "Density" "Density" "Density" "Density" ...
$ X.2: chr "2018/03/01" "2018/03/02" "2018/03/03" "2018/03/04" ...
Column 1 is the detector ID.
Column 2 is the metric.
Column 3 is the date.
Now lets change the data types in the columns.
We create a correctly formatted date field from the ‘Date’ field.
We label the field which displays the record type (i.e. Density vs Speed vs Volume) as ‘Metric’ and change it from a string to a factor field.
We also change Detector_ID from a numeric field to a factor field.
# List the column names after the clean-up steps above.
names(March_18_dat)
[1] "Detector_ID" "Metric" "Date" "Date_Posixct" "filename" "X12.15.AM" "X12.30.AM" "X12.45.AM"
[9] "X1.00.AM" "X1.15.AM" "X1.30.AM" "X1.45.AM" "X2.00.AM" "X2.15.AM" "X2.30.AM" "X2.45.AM"
[17] "X3.00.AM" "X3.15.AM" "X3.30.AM" "X3.45.AM" "X4.00.AM" "X4.15.AM" "X4.30.AM" "X4.45.AM"
[25] "X5.00.AM" "X5.15.AM" "X5.30.AM" "X5.45.AM" "X6.00.AM" "X6.15.AM" "X6.30.AM" "X6.45.AM"
[33] "X7.00.AM" "X7.15.AM" "X7.30.AM" "X7.45.AM" "X8.00.AM" "X8.15.AM" "X8.30.AM" "X8.45.AM"
[41] "X9.00.AM" "X9.15.AM" "X9.30.AM" "X9.45.AM" "X10.00.AM" "X10.15.AM" "X10.30.AM" "X10.45.AM"
[49] "X11.00.AM" "X11.15.AM" "X11.30.AM" "X11.45.AM" "X1.00.PM" "X12.15.PM" "X12.30.PM" "X12.45.PM"
[57] "X2.00.PM" "X1.15.PM" "X1.30.PM" "X1.45.PM" "X3.00.PM" "X2.15.PM" "X2.30.PM" "X2.45.PM"
[65] "X4.00.PM" "X3.15.PM" "X3.30.PM" "X3.45.PM" "X5.00.PM" "X4.15.PM" "X4.30.PM" "X4.45.PM"
[73] "X6.00.PM" "X5.15.PM" "X5.30.PM" "X5.45.PM" "X7.00.PM" "X6.15.PM" "X6.30.PM" "X6.45.PM"
[81] "X8.00.PM" "X7.15.PM" "X7.30.PM" "X7.45.PM" "X9.00.PM" "X8.15.PM" "X8.30.PM" "X8.45.PM"
[89] "X10.00.PM" "X9.15.PM" "X9.30.PM" "X9.45.PM" "X11.00.PM" "X10.15.PM" "X10.30.PM" "X10.45.PM"
[97] "X00.00.AM" "X11.15.PM" "X11.30.PM" "X11.45.PM" "X00.00.PM"
Noon and midnight are labelled with words rather than times. The remaining columns are labelled with times (as strings), but their format is inconsistent. Let's change noon and midnight to the same format as the other times and apply one uniform format across all the column names.
Melt the data. We want to change the data set from a wide format to a long format. Below are the top 6 rows.
Filter to the PM Peak hours, 2pm to 7pm.
# Hours of the PM peak: 14 through 19 inclusive.
PM_Peak <- 14:19

# Keep only the rows whose hour falls inside the peak window.
March_18_melt_PM_Peak <- March_18_melt %>%
  filter(Hour %in% PM_Peak)
The new filtered data set only includes data between hour 14 and hour 19.
Let's find which detectors have bad values. Bad records are listed as ‘-1’. We build a list of the detectors that contain no such records; the remaining detectors are omitted from our initial analysis.
# Count the negative readings per detector (the bad-value marker is -1).
bad_counts <- summarise(
  group_by(March_18_melt_PM_Peak, Detector_ID),
  NAs = sum(Metric_value < 0)
)

# Keep the detectors with no negative reading and drop unused factor levels.
var <- droplevels(filter(arrange(bad_counts, NAs), NAs == 0))

# IDs of the clean detectors, as character strings.
no_NAs <- as.character(var$Detector_ID)
print(no_NAs)
[1] "70" "72" "105" "115" "119" "147" "148" "149" "150" "151" "157" "158" "159" "160" "162" "232" "233" "234"
[19] "236" "237" "239" "240" "241" "242" "243" "244" "245" "246" "247" "248" "249" "250" "251" "252" "253" "271"
[37] "272" "273" "274" "275" "276" "277" "278" "279" "280" "281" "282" "287" "288" "289" "290" "291" "292" "351"
[55] "354" "355" "389" "406" "458" "459" "460" "462" "463" "464" "465" "466" "467" "487" "541" "545" "571" "572"
[73] "573" "577" "579" "582" "583" "591" "729" "806" "807" "817" "818" "819" "820" "821" "822" "823" "824" "825"
[91] "826" "827" "828" "829" "830" "831" "832" "833" "834" "836" "837" "838" "839" "840" "841" "842" "849" "850"
[109] "851" "852" "853" "854" "855" "856" "857" "858" "862" "863" "867" "868" "869" "870" "871" "872" "873" "874"
[127] "875" "876" "877" "878" "879" "880" "881" "882" "883" "884" "885" "886" "887" "894" "895" "896" "995" "996"
[145] "997" "998" "999" "1000" "1001" "1176" "1177" "1179" "1180" "1181" "1182" "1183" "1184" "1185" "1186" "1187" "1192" "1195"
[163] "1196" "1197" "1198" "1199" "1200" "1206" "1207" "1208" "1209" "1210" "1211" "1212" "1214" "1215" "1217" "1218" "1219" "1220"
[181] "1222" "1223" "1224" "1225" "1231" "1232" "1233" "1234" "1235" "1236" "1237" "1238" "1247" "1248" "1249" "1250" "1251" "1252"
[199] "1255" "1256" "1257" "1258" "1259" "1260" "1261" "1262" "1298" "1299" "1300" "1301" "1302" "1303" "1304" "1305" "1435" "1461"
[217] "1467" "1468" "1472" "1473" "1474" "1484" "1485" "1486" "1487" "1490" "1492" "1493" "1494" "1496" "1497" "1499" "1500" "1501"
[235] "1502" "1504" "1505" "1506" "1507" "1508" "1529" "1530" "1531" "1532" "1533" "1534" "1535" "1536" "1538" "1540" "1541" "1544"
[253] "1545" "1546" "1547" "1548" "1549" "1550" "1551" "1552" "1553" "1590" "1591" "1592" "1597" "1598" "1600" "1601" "1602" "1603"
[271] "1604" "1605" "1606" "1607" "1608" "1609" "1610" "1611" "1612" "1613" "1619" "1620" "1621" "3290" "3291" "3324" "3325" "3326"
[289] "3333" "3334" "3337" "3338" "3339" "3340" "3406" "3407" "3409" "3419" "3927" "3929" "3930" "3931" "3932" "3933" "3934" "3941"
[307] "3942" "3943" "3944" "3945" "3946" "3953" "4068" "4069" "4070" "4071" "4072" "4153" "4154" "4155" "4156" "4157" "4158" "5184"
[325] "5185" "5186" "5187" "5192" "5193" "5194" "5666" "5667" "5668" "5690" "5691" "5692" "5693" "6298" "6797" "6816" "6817" "6818"
[343] "6908" "6909" "6911" "6912" "6913" "6914" "6915" "6916" "6917" "6918" "6919" "6920" "6921" "6922" "6923" "6924" "6925" "7100"
[361] "7101" "7102" "7105" "7106" "7109" "7111" "7529" "7531" "7532" "7533" "7534" "7535" "7536" "7537" "7538" "7716" "7717" "7718"
There are 378 detectors that do not contain a -1 value.
Time Series
Let's pick Detector 70. We want to create a variable for each day and then sum the volumes for each day.
Now we create a time series for the data.
We decompose the Time Series to extract the trend, seasonality and remainder from the time series.

Day selection is ?.
Correlation
Show plots of volumes by day
# Volume records of detectors 147-151, restricted to Tue/Wed/Thu, with a few
# calendar helper columns derived from the timestamp.
y <- March_18_melt_PM_Peak %>%
  filter(Metric == 'Volume', Detector_ID %in% 147:151) %>%
  mutate(
    Weekday = weekdays(Date_Time, abbreviate = TRUE),
    YearDay = yday(Date_Time),
    MonthDay = day(Date_Time),
    HourMin = as.numeric(format(Date_Time, "%H.%M"))
  ) %>%
  filter(Weekday %in% c('Tue', 'Wed', 'Thu'))

# One density curve per detector, one panel per day of the month.
ggplot(data = y, mapping = aes(x = Metric_value, color = as.factor(Detector_ID))) +
  geom_density() +
  facet_wrap(~MonthDay, nrow = 4, scales = 'free_x')
Create a detector ~ day heatmap. Each cell shows the total PM-peak volume recorded by one detector on one day, with colours scaled within each row. For display purposes only the last 100 detector IDs are considered, and of those only the detectors without any -1 values are shown.
Invalid value for Colv, ignoring
Cluster Days
# Spread of the daily totals across detectors: sum the PM-peak volume per
# detector and day, then take the standard deviation over detectors per day.
# NOTE(review): detectors with -1 readings are not excluded here - confirm
# whether they should be.
day_SDev <- March_18_melt_PM_Peak %>%
  filter(Metric == 'Volume') %>%
  group_by(Detector_ID, Date_Posixct) %>%
  summarise(SumVol = sum(Metric_value)) %>%
  group_by(Date_Posixct) %>%
  summarise(SDev = sd(SumVol)) %>%
  mutate(weekDay = weekdays(Date_Posixct, abbreviate = TRUE))

# Dates falling on a Sunday, drawn as dotted guide lines.
# NOTE(review): 'Sun' assumes an English locale for weekdays().
sundays <- day_SDev$Date_Posixct[day_SDev$weekDay %in% c('Sun')]

# The x axis carries the date and the y axis the standard deviation; the
# axis titles were previously swapped.
ggplot(day_SDev) +
  geom_point(aes(Date_Posixct, SDev)) +
  geom_rug(aes(Date_Posixct, SDev)) +
  # as.numeric() keeps the intercepts valid on ggplot2 versions that do not
  # accept date-time objects for xintercept.
  geom_vline(xintercept = as.numeric(sundays), linetype = 'dotted') +
  theme_tufte(ticks = FALSE) +
  xlab("Date") +
  ylab("STD of the total volume across all Detectors") +
  theme(axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1))
Cumulative running total
# Volume records of detector 4069 with a decimal-hour column, ordered in time
# and grouped by day of the year.
det_4069 <- March_18_melt_PM_Peak %>%
  filter(Detector_ID == 4069 & Metric == 'Volume') %>%
  mutate(
    Weekday = weekdays(Date_Time, abbreviate = TRUE),
    YearDay = yday(Date_Time),
    MonthDay = day(Date_Time),
    HourMin = as.numeric(format(Date_Time, "%H")) +
      as.numeric(format(Date_Time, "%M")) / 60
  ) %>%
  droplevels() %>%
  arrange(YearDay, HourMin) %>%
  group_by(YearDay)

# Per day: running total of the volume, plus the slope, correlation and
# covariance of that running total against the time of day.
g <- det_4069 %>%
  mutate(
    cumSum = cumsum(Metric_value),
    lreg = coef(lm(cumSum ~ HourMin))[2],
    correlation = cor(HourMin, cumSum),
    cov = cov(HourMin, cumSum)
  )

# One row per date carrying the per-day statistics and the weekday name.
daily_stats <- summarise(
  group_by(g, Date_Posixct),
  Correlation = first(correlation),
  LIN = first(lreg),
  cov = first(cov)
)
h <- daily_stats %>%
  mutate(wday = weekdays(Date_Posixct)) %>%
  arrange(Date_Posixct)
The plot below shows the cumulative sum of the peak-hour volumes for each day in March 2018 for Detector 4069. The color scheme shows increasing covariance. The lines toward the bottom of the plot are Saturdays and Sundays.
The following plot displays straight volume across the peak hours for March for Detector 4069.
---
title: "R Notebook"
output: html_notebook
---

Set working directory and load necessary packages.

```{r setup, include=FALSE}
# Evaluate every chunk relative to the volume-data folder so that the
# relative paths used further down (e.g. './March 18') resolve against it.
# NOTE(review): this is an absolute path on a mapped H: drive - the notebook
# only knits on a machine where that drive is available.
knitr::opts_knit$set(root.dir = "H:/Projects/11000/11155/TraffStudy/DataCollection/FreewayData/Detector Data/Volume Data")

# Packages required to run this notebook.
library(tidyverse)
library(lubridate)
library(gridExtra)
library(reshape2)
library(zoo)
library(imputeTS)
library(xts)
library(dygraphs)
library(d3heatmap)
library(ggthemes)
library(plotly)

```

Load the data for March. Load each file and combine into one data set. We load and combine each file in `r getwd()`. We only want to load the files that end in "_#.csv".

```{r explore March, echo=FALSE}

# Show everything that sits in the March 18 folder.
list.files('./March 18')

```



```{r load data for March,warning=FALSE,echo=FALSE}
path <- './March 18'

# Load only the exports named "<something>_<number>.csv".
# The pattern is anchored at the end of the name and accepts several digits,
# so March_10.csv, March_11.csv and March_12.csv are picked up as well; the
# previous pattern ('*_\\d.csv') matched a single digit only and silently
# skipped them. Files without a numeric suffix (e.g. Combined_March.csv) are
# still left out.
list_csv <- dir(path = path, pattern = "_[0-9]+\\.csv$")

# Fail early with a clear message instead of producing an empty data set.
if (length(list_csv) == 0) {
  stop("No files matching '_<number>.csv' found in ", path, call. = FALSE)
}

# Read every file, keeping text columns as character.
myfiles <- lapply(
  file.path(path, list_csv),
  function(x) read.csv(x, stringsAsFactors = FALSE)
)

# Tag each table with the name of the file it came from. The column is kept
# as character so that bind_rows() does not have to reconcile factors.
files <- Map(
  function(tbl, nm) cbind(tbl, filename = nm, stringsAsFactors = FALSE),
  myfiles,
  list_csv
)

# Stack the per-file tables into one data set.
dat <- bind_rows(files)

rm(myfiles, files)

# Working copy that the following chunks modify; 'dat' stays untouched.
March_18_dat <- dat
```

The column names in each file are as follows.

```{r examine columns, echo=FALSE}

# List the column names of the combined data set.
names(March_18_dat)
```

Let's fix the column headers first. There are no headers for the first 3 columns, and there also appears to be an extra column, "X.3", at the end of the data frame.

```{r suymmary of dataset}
# Summarise the trailing unnamed column to see what it holds.
summary(March_18_dat[["X.3"]])
```

The column contains only NAs, so we remove it.

```{r remove blank last column, echo=FALSE}

# Keep every column except the empty "X.3".
keep_cols <- names(March_18_dat) != "X.3"
March_18_dat <- March_18_dat[, keep_cols]

```

Focus on the first three columns;

```{r examine first 3 rows,echo=FALSE}
# Structure of the three unnamed leading columns.
str(March_18_dat[, 1:3])
```

Column 1 is the detector ID.<br/>
Column 2 is the metric.<br/>
Column 3 is the date.<br/>


```{r change first 3 column names, echo=FALSE}

# Give the three leading columns meaningful names.
names(March_18_dat)[seq_len(3)] <- c('Detector_ID', 'Metric', 'Date')

```

Now lets change the data types in the columns.

We create a correctly formatted date field from the 'Date' field.<br/>
We label the field which displays the record type (i.e. Density vs Speed vs Volume) as 'Metric' and change it from a string to a factor field.<br/>
We also change Detector_ID from a numeric field to a factor field.

```{r metric and Date_posixct,echo=FALSE,include=FALSE}

# The record type and the detector ID are categorical, so store them as factors.
March_18_dat$Metric <- as.factor(March_18_dat$Metric)
March_18_dat$Detector_ID <- as.factor(March_18_dat$Detector_ID)

# Parse the "YYYY/MM/DD" text into a date-time; the text column is kept.
March_18_dat$Date_Posixct <- as.POSIXct(March_18_dat$Date, format = "%Y/%m/%d")

## Check that our dates worked out. We don't want any NAs.
## table() leaves NA out by default, so ask for it explicitly; an <NA>
## entry in the result means some dates failed to parse.
table(March_18_dat$Date_Posixct, useNA = "ifany")

```



```{r Time field}

# List the column names after the clean-up steps above.
names(March_18_dat)
```


Noon and midnight are labelled with words rather than times. The remaining columns are labelled with times (as strings), but their format is inconsistent. Let's change noon and midnight to the same format as the other times and apply one uniform format across all the column names.


```{r change noon midnight names,echo=FALSE}

# Bring every time column to the "X<hour>.<minute>.<AM|PM>" pattern.
#
# Each name is rewritten on its own. The previous version assigned a vector
# of new names through a %in% mask; such an assignment fills the selected
# positions in column order, not in the order of the lookup vector. As a
# result 'Noon' received 'X1.00.PM', every on-the-hour PM column was shifted
# by one hour, and the last two columns became 'X00.00.AM' / 'X00.00.PM',
# which the 12-hour "%I" format cannot parse.
time_cols <- colnames(March_18_dat)

# On-the-hour columns lack the minutes: X1.AM -> X1.00.AM, X11.PM -> X11.00.PM.
time_cols <- sub("^X([0-9]{1,2})\\.(AM|PM)$", "X\\1.00.\\2", time_cols)

# Noon and midnight written on the 12-hour clock.
# NOTE(review): 'Midnight' is the last time column of each row, so it
# presumably closes the day; as 12.00.AM of the same date it is placed at the
# start of that date - confirm this is acceptable. The PM peak is unaffected.
time_cols[time_cols == "Noon"] <- "X12.00.PM"
time_cols[time_cols == "Midnight"] <- "X12.00.AM"

colnames(March_18_dat) <- time_cols

# Identifier columns first, time columns after them.
March_18_dat <- March_18_dat %>%
  select(Detector_ID, Metric, Date, Date_Posixct, filename, everything())


```

Melt the data. We want to change the data set from a wide format to a long format. Below are the top 6 rows.

```{r melt,echo=FALSE}

# Columns that identify a record; every other column is a time slot.
id_cols <- c('Detector_ID', 'Metric', 'Date', 'Date_Posixct', 'filename')

# Wide -> long: one row per record and time slot.
March_18_melt <- melt(
  March_18_dat,
  id.vars = id_cols,
  variable.name = "Time",
  value.name = 'Metric_value'
)

head(March_18_melt)
```



```{r Date_Time, echo=FALSE}

# Turn the time-slot label into a proper timestamp:
#   1. strip the "X" from the label,
#   2. join it to the date text and parse it on the 12-hour clock
#      (labels that do not fit the format come back as NA),
#   3. extract the hour of the day.
March_18_melt <- March_18_melt %>%
  mutate(
    Time = gsub("X", "", Time, fixed = TRUE),
    Date_Time = as.POSIXct(paste(Date, Time), format = "%Y/%m/%d %I.%M.%p"),
    Hour = hour(Date_Time)
  )

```

Filter to the PM Peak hours, 2pm to 7pm.

```{r Peak Hours}

# Hours of the PM peak: 14 through 19 inclusive.
PM_Peak <- 14:19

# Keep only the rows whose hour falls inside the peak window.
March_18_melt_PM_Peak <- March_18_melt %>%
  filter(Hour %in% PM_Peak)

```

The new filtered data set only includes data between hour `r min(March_18_melt_PM_Peak$Hour)` and hour `r max(March_18_melt_PM_Peak$Hour)`.

Let's find which detectors have bad values. Bad records are listed as '-1'. We build a list of the detectors that contain no such records; the remaining detectors are omitted from our initial analysis.

```{r NA values}

# Count the negative readings per detector (the bad-value marker is -1).
bad_counts <- summarise(
  group_by(March_18_melt_PM_Peak, Detector_ID),
  NAs = sum(Metric_value < 0)
)

# Keep the detectors with no negative reading and drop unused factor levels.
var <- droplevels(filter(arrange(bad_counts, NAs), NAs == 0))

# IDs of the clean detectors, as character strings.
no_NAs <- as.character(var$Detector_ID)
print(no_NAs)
```

There are `r length(no_NAs)` detectors that do not contain a -1 value.

## Time Series

Let's pick Detector 70. We want to create a variable for each day and then sum the volumes for each day.


```{r det_70,echo=FALSE}

# Daily PM-peak volume for detector 70.
# The text, the chunk label and the object name all refer to detector 70,
# but the filter previously selected detector 150; it now selects 70.
# Detector_ID is a factor, so it is compared with the ID as text.
# The helper columns that used to be added here were dropped again by
# summarise(), so they are no longer computed. All days of the week are kept.
Det_70 <- March_18_melt_PM_Peak %>%
  filter(Detector_ID == "70", Metric == "Volume") %>%
  group_by(Date_Posixct) %>%
  summarise(SumVol = sum(Metric_value))

head(Det_70)
```


Now we create a time series for the data.

```{r Time Series,fig.width=9,fig.height=5,echo=FALSE}

# Daily totals as an xts series indexed by date.
Det_70_ts_xts <- xts(Det_70$SumVol, order.by = Det_70$Date_Posixct)

# Interactive line chart of the series with a range selector underneath.
Det_70_ts_xts %>%
  dygraph() %>%
  dySeries(label = 'Total Vol.') %>%
  dyRangeSelector()
```

We decompose the Time Series to extract the trend, seasonality and remainder from the time series. 

```{r decompose data, echo=FALSE}
# Det_70 holds one total per calendar day (weekends included), so the
# repeating cycle is the 7-day week. The previous frequency = 3 would only
# suit a series restricted to Tuesday-Thursday.
Det_70_ts <- ts(Det_70$SumVol, frequency = 7)

# Classical decomposition (kept for inspection) and the STL decomposition
# that is used below.
Det_70_ts_deomp <- decompose(Det_70_ts)

Det_70_ts_stl <- stl(Det_70_ts, s.window = 'periodic')

# NOTE(review): full weekday names assume an English locale for weekdays().
midweek <- c('Tuesday', 'Wednesday', 'Thursday')

plot(Det_70_ts_stl)

# Attach date and weekday to the seasonal / trend / remainder components,
# keep the midweek days and list them by increasing (signed) remainder.
day_select <- cbind(
  data.frame(Det_70_ts_stl$time.series[, 1:3]),
  Date = Det_70$Date_Posixct,
  Weekday = weekdays(Det_70$Date_Posixct)
) %>%
  # mutate(Resid = abs(remainder))%>%
  filter(Weekday %in% midweek) %>%
  arrange(remainder) %>%
  print()

```


Day selection is ?.

## Correlation

Show plots of volumes by day

```{r plot values}
# Volume records of detectors 147-151, restricted to Tue/Wed/Thu, with a few
# calendar helper columns derived from the timestamp.
y <- March_18_melt_PM_Peak %>%
  filter(Metric == 'Volume', Detector_ID %in% 147:151) %>%
  mutate(
    Weekday = weekdays(Date_Time, abbreviate = TRUE),
    YearDay = yday(Date_Time),
    MonthDay = day(Date_Time),
    HourMin = as.numeric(format(Date_Time, "%H.%M"))
  ) %>%
  filter(Weekday %in% c('Tue', 'Wed', 'Thu'))

# One density curve per detector, one panel per day of the month.
ggplot(data = y, mapping = aes(x = Metric_value, color = as.factor(Detector_ID))) +
  geom_density() +
  facet_wrap(~MonthDay, nrow = 4, scales = 'free_x')
```


Create a detector ~ day heatmap. Each cell shows the total PM-peak volume recorded by one detector on one day, with colours scaled within each row. For display purposes only the last 100 detector IDs are considered, and of those only the detectors without any -1 values are shown.

```{r heatmap for subset,fig.width=9,echo=FALSE}

# The last 100 detector IDs (factor levels). tail() is used instead of
# last(): which last() is called depends on the order in which dplyr and xts
# are attached, and only the xts one accepts a count.
levels <- tail(levels(March_18_melt_PM_Peak$Detector_ID), 100)

# Volume records of the selected clean detectors. Per detector and day the
# running total is added together with its slope against the time of day
# (stored under the name 'correlation') and its covariance with it.
matrix <- March_18_melt_PM_Peak %>%
  mutate(Weekday = weekdays(Date_Time, abbreviate = TRUE),
         YearDay = yday(Date_Time),
         MonthDay = day(Date_Time),
         HourMin = as.numeric(format(Date_Time, "%H")) +
           as.numeric(format(Date_Time, "%M")) / 60) %>%
  filter(Metric == 'Volume') %>%
  filter(Detector_ID %in% levels) %>%
  filter(Detector_ID %in% no_NAs) %>%
  group_by(Date_Posixct, Detector_ID) %>%
  arrange(Date_Time) %>%
  mutate(cumSum = cumsum(Metric_value)) %>%
  mutate(correlation = lm(cumSum ~ HourMin)$coefficients[2],
         cov = cov(HourMin, cumSum)) %>%
  arrange(Date_Posixct)

# One row per detector, one column per day, holding the total volume.
# The grouping left over from summarise() is removed before reshaping.
matrix_spread <- matrix %>%
  group_by(Date_Posixct, Detector_ID) %>%
  # summarise(correlation=first(correlation))%>%
  summarise(Volume = sum(Metric_value)) %>%
  # summarise(cov=first(cov))%>%
  ungroup() %>%
  spread(Date_Posixct, Volume)

# Move the detector IDs from the first column into the row names.
vector <- as.character(matrix_spread$Detector_ID)
matrix2 <- matrix_spread[, -1] %>% as.data.frame()
rownames(matrix2) <- vector

# Colv = FALSE keeps the day columns in their original order. The previous
# value 'as-is' is not a valid setting and only produced the warning
# "Invalid value for Colv, ignoring".
d3heatmap(matrix2, scale = 'row', Colv = FALSE, colors = 'Blues')
```

Cluster Days

```{r}

# Spread of the daily totals across detectors: sum the PM-peak volume per
# detector and day, then take the standard deviation over detectors per day.
# The helper columns that used to be added first were dropped again by
# summarise(), so they are no longer computed.
# NOTE(review): detectors with -1 readings are not excluded here - confirm
# whether they should be.
day_SDev <- March_18_melt_PM_Peak %>%
  filter(Metric == 'Volume') %>%
  group_by(Detector_ID, Date_Posixct) %>%
  summarise(SumVol = sum(Metric_value)) %>%
  group_by(Date_Posixct) %>%
  summarise(SDev = sd(SumVol)) %>%
  mutate(weekDay = weekdays(Date_Posixct, abbreviate = TRUE))

# Dates falling on a Sunday, drawn as dotted guide lines.
# NOTE(review): 'Sun' assumes an English locale for weekdays().
sundays <- day_SDev$Date_Posixct[day_SDev$weekDay %in% c('Sun')]

# The x axis carries the date and the y axis the standard deviation; the
# axis titles were previously swapped.
ggplot(day_SDev) +
  geom_point(aes(Date_Posixct, SDev)) +
  geom_rug(aes(Date_Posixct, SDev)) +
  # as.numeric() keeps the intercepts valid on ggplot2 versions that do not
  # accept date-time objects for xintercept.
  geom_vline(xintercept = as.numeric(sundays), linetype = 'dotted') +
  theme_tufte(ticks = FALSE) +
  xlab("Date") +
  ylab("STD of the total volume across all Detectors") +
  theme(axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1))

```

Cumulative running total

```{r cumsum, fig.width=9}

# Volume records of detector 4069 with a decimal-hour column, ordered in time
# and grouped by day of the year.
det_4069 <- March_18_melt_PM_Peak %>%
  filter(Detector_ID == 4069 & Metric == 'Volume') %>%
  mutate(
    Weekday = weekdays(Date_Time, abbreviate = TRUE),
    YearDay = yday(Date_Time),
    MonthDay = day(Date_Time),
    HourMin = as.numeric(format(Date_Time, "%H")) +
      as.numeric(format(Date_Time, "%M")) / 60
  ) %>%
  droplevels() %>%
  arrange(YearDay, HourMin) %>%
  group_by(YearDay)

# Per day: running total of the volume, plus the slope, correlation and
# covariance of that running total against the time of day.
g <- det_4069 %>%
  mutate(
    cumSum = cumsum(Metric_value),
    lreg = coef(lm(cumSum ~ HourMin))[2],
    correlation = cor(HourMin, cumSum),
    cov = cov(HourMin, cumSum)
  )

# One row per date carrying the per-day statistics and the weekday name.
daily_stats <- summarise(
  group_by(g, Date_Posixct),
  Correlation = first(correlation),
  LIN = first(lreg),
  cov = first(cov)
)
h <- daily_stats %>%
  mutate(wday = weekdays(Date_Posixct)) %>%
  arrange(Date_Posixct)
```

The plot below shows the cumulative sum of the peak-hour volumes for each day in March 2018 for Detector 4069. The color scheme shows increasing covariance. The lines toward the bottom of the plot are Saturdays and Sundays.

```{r plot cunsum,echo=FALSE}
# Running total over the peak hours, one line per day, coloured by the
# covariance computed for that day.
plot <- ggplot(g) +
  geom_line(aes(x = HourMin, y = cumSum, group = Date_Posixct, color = cov)) +
  theme_tufte(ticks = FALSE) +
  labs(x = "Hour", y = "CumSum") +
  theme(
    axis.title.x = element_text(vjust = -0.5),
    axis.title.y = element_text(vjust = 1)
  )

# Interactive version of the chart.
ggplotly(plot)
```

The following plot displays straight volume across the peak hours for March for Detector 4069.

```{r plot2   volume,echo=FALSE}
# Raw volume over the peak hours, one line per day, coloured by the
# covariance computed for that day.
plot2 <- ggplot(g) +
  geom_line(aes(x = HourMin, y = Metric_value, group = Date_Posixct, color = cov)) +
  theme_tufte(ticks = FALSE) +
  labs(x = "Hour", y = "Volume") +
  theme(
    axis.title.x = element_text(vjust = -0.5),
    axis.title.y = element_text(vjust = 1)
  )

# Interactive version of the chart.
ggplotly(plot2)

```



